We're going to download recent crimes from the City of Chicago's data portal.

Ultimately, we're going to construct a query using the REST API of Socrata, the service/software used to host the city's data. This data set has changed in the past, so before we build our query, let's get a list of the column names so we can figure out which column to filter on to find recent crimes.


In [ ]:
import json

import requests

CRIME_SOCRATA_VIEW_ID = 'ijzp-q8t2'

def get_data_portal_url(view_id):
    return 'http://data.cityofchicago.org/api/views/{view_id}'.format(
      view_id=view_id)

def get_dataset_columns(view_id):
    """
    Get dataset field names from the Socrata API

    Returns:
        A dictionary that acts as a lookup table from column ID to column name

    """
    url = get_data_portal_url(view_id)
    meta_response = requests.get(url)
    if not meta_response.ok:
        meta_response.raise_for_status()

    meta = meta_response.json()
    return {c['id']: c['name'] for c in meta['columns']}

columns = get_dataset_columns(CRIME_SOCRATA_VIEW_ID)

for column_id, name in columns.items():
    print("{}: {}".format(column_id, name))

It looks like the column named "Date" with an ID of "154418879" is the one we want.


In [9]:
date_column_id, date_column_name = next((i, n) for i, n in columns.items() if n.lower() == "date")
print("Date column ID: {}".format(date_column_id))


Date column ID: 154418879

The response data contains column IDs rather than column names. Let's build a lookup table to convert them later, and a helper function to "fix" our rows that we get from the API.


In [30]:
def slugify(s, replacement='_'):
    return s.replace(' ', replacement).lower()

def get_clean_column_lookup(column_lookup):
    return {str(i): slugify(n) for i, n in column_lookup.items()}

human_columns = get_clean_column_lookup(columns)
import pprint
pprint.pprint(human_columns)

def humanize_columns(row, column_lookup):
    humanized = {}
    for column_id, value in row.items():
        try:
            humanized[column_lookup[column_id]] = value
        except KeyError:
            humanized[column_id] = value
    
    return humanized


{'154418877': 'id',
 '154418878': 'case_number',
 '154418879': 'date',
 '154418880': 'block',
 '154418881': 'iucr',
 '154418882': 'primary_type',
 '154418883': 'description',
 '154418884': 'location_description',
 '154418885': 'arrest',
 '154418886': 'domestic',
 '154418887': 'beat',
 '154418888': 'district',
 '154418889': 'ward',
 '154418890': 'community_area',
 '154418891': 'fbi_code',
 '154418892': 'x_coordinate',
 '154418893': 'y_coordinate',
 '154418894': 'year',
 '154418895': 'updated_on',
 '154418896': 'latitude',
 '154418897': 'longitude',
 '154418898': 'location'}

Let's build a query for the Socrata API.


In [11]:
from datetime import date, timedelta

def build_query(since_date, date_column_id, view_id):
    """
    Get a Socrata API query for all records updated after the last update

    Args:
       since_date (datetine.date): date object. All crimes since this date will be retrieved.
       date_column_id (str): String containing the column ID for the dates we'll filter on
       view_id (str): Socrata view ID for this dataset

    Returns:
        Dictionary that can be serialized into a JSON sring used as the POST
        body to the Socrata API

    """

    query = {
        'originalViewId': view_id,
        'name': 'inline filter',
        'query' : {
            'filterCondition': {
                'type': 'operator',
                'value': 'AND',
                'children' : [{
                    'type' : 'operator',
                    'value' : 'GREATER_THAN',
                    'children': [{
                        'columnId' : date_column_id,
                        'type' : 'column',
                    }, {
                        'type' : 'literal',
                        'value' : since_date.strftime('%Y-%m-%d'),
                    }],
                }],
            },
        }
    }
    return query

# Months are different lenghts.  Let's just find the date 30 days ago
today = date.today()
date_30_days_ago = today - timedelta(days=30)
query = build_query(date_30_days_ago, date_column_id, CRIME_SOCRATA_VIEW_ID)

import pprint
print("The query looks like this: ")
pprint.pprint(query)


The query looks like this: 
{'name': 'inline filter',
 'originalViewId': 'ijzp-q8t2',
 'query': {'filterCondition': {'children': [{'children': [{'columnId': 154418879,
                                                           'type': 'column'},
                                                          {'type': 'literal',
                                                           'value': '2016-03-06'}],
                                             'type': 'operator',
                                             'value': 'GREATER_THAN'}],
                               'type': 'operator',
                               'value': 'AND'}}}

Now let's request the data from the API, using our query


In [32]:
import json
import requests

def get_rows_url(start, count):
    url_tpl = "https://data.cityofchicago.org/api/views/INLINE/rows.json?method=getRows&start={start}&length={length}"
    return url_tpl.format(
      start=start,
      length=count
    )

def get_rows(query, start=0, count=1000):
    url = get_rows_url(start, count)
    headers = { 'content-type' : 'application/json' }
    response = requests.post(url, data=json.dumps(query), headers=headers, verify=False)
    return response.json()
    
def transform_row(row, transforms):
    transformed_row = row
    for transform in transforms:
        transformed_row = transform(transformed_row)
    
    return transformed_row
    
def get_all_rows(query, transforms=[]):
    continue_fetching = True
    page_size = 1000
    start = 0
    
    while continue_fetching:
        rows = get_rows(query, start, page_size)
        if len(rows) < page_size:
            continue_fetching = False
            
        start += page_size
        
        for row in rows:
            yield(transform_row(row, transforms))
        
crimes = list(get_all_rows(query, transforms=[lambda r: humanize_columns(r, human_columns)]))


/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)
/Users/ghing/venvs/public-notebooks/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py:791: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html
  InsecureRequestWarning)

In [37]:
import pprint

print("There are {} crimes since {}".format(len(crimes), date_30_days_ago.strftime("%Y-%m-%d")))

print("The first one looks like: ")
pprint.pprint(crimes[0])


There are 15505 crimes since 2016-03-06
The first one looks like: 
{'arrest': False,
 'beat': '0711',
 'block': '055XX S WELLS ST',
 'case_number': 'HZ176211',
 'community_area': '68',
 'created_at': 1457954936,
 'created_meta': '878752',
 'date': '2016-03-06T00:20:00',
 'description': 'COUNTERFEITING DOCUMENT',
 'district': '007',
 'domestic': False,
 'fbi_code': '10',
 'id': 'ACB1B918-2EB2-40A4-B826-2A6FAD0EEC87',
 'iucr': '1121',
 'latitude': '41.794088699',
 'location': {'human_address': '{"address":"","city":"","state":"","zip":""}',
              'latitude': '41.794088699',
              'longitude': '-87.631799109',
              'needs_recoding': False},
 'location_description': 'GAS STATION',
 'longitude': '-87.631799109',
 'position': 12529105,
 'primary_type': 'DECEPTIVE PRACTICE',
 'sid': 12529105,
 'updated_at': 1457954936,
 'updated_meta': '878752',
 'updated_on': '2016-03-13T15:49:08',
 'ward': '3',
 'x_coordinate': '1175550',
 'y_coordinate': '1868337',
 'year': '2016'}

In [ ]: